package com.spider.ask120.magic.WebMagic;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * WebMagic {@link PageProcessor} for the disease ("jibing") section of m.120ask.com.
 *
 * <p>{@link #process(Page)} dispatches on the page URL:
 * <ul>
 *   <li>the section index: queues every disease link found on the page;</li>
 *   <li>a disease page: introduction, diet, one of the short text sub-pages, or
 *       (anything else) the disease overview page;</li>
 *   <li>a drug-plan page under {@code /yaopin/programme/}.</li>
 * </ul>
 * Single-valued results are stored with {@code page.putField}; repeated/tabular data is
 * currently only printed to standard output.
 *
 * <p>Created: 2016-8-16
 * @author Jibaole
 */
public class JibingAction implements PageProcessor{
	// URL patterns. All of them are used as regular expressions against page URLs or links;
	// "\\w+" stands for the path segment that follows /jibing/ (presumably the disease slug).
	static String jbUrl = "http://m.120ask.com/jibing/";
	static String urlList = "http://m.120ask.com/jibing/\\w+";
	// Diet ("shiliao") and introduction ("jieshao") sub-pages.
	static String shiliao = "http://m.120ask.com/jibing/\\w+/shiliao/";
	static String jieshao = "http://m.120ask.com/jibing/\\w+/jieshao/";
	
	// Short text sub-pages; all six are handled by the same extraction logic.
	static String bingyin = "http://m.120ask.com/jibing/\\w+/bingyin/";
	static String zhengzhuang = "http://m.120ask.com/jibing/\\w+/zhengzhuang/";
	static String jianbie = "http://m.120ask.com/jibing/\\w+/jianbie/";
	static String bingfa = "http://m.120ask.com/jibing/\\w+/bingfa/";
	static String yufang = "http://m.120ask.com/jibing/\\w+/yufang/";
	static String zhiliao = "http://m.120ask.com/jibing/\\w+/zhiliao/";
	
	// Drug plan pages.
	static String fangan = "http://m.120ask.com/yaopin/programme/[0-9]+";
	
	// Part one: crawl configuration for the site (encoding, crawl interval, retry count, ...).
	// NOTE(review): sleep time is set to 1 — if the unit is milliseconds this is a very
	// aggressive crawl rate; confirm this is intended.
	private final Site site = Site.me().setRetryTimes(3).setSleepTime(1);
	
	/**
	 * Returns the crawl configuration used by this processor.
	 *
	 * @return the {@link Site} built when this processor was created
	 */
	public Site getSite() {
		return site;
	}

	/**
	 * Core crawler callback: extracts data from {@code page} and queues follow-up requests,
	 * choosing the extraction routine from the page URL. Pages whose URL matches none of the
	 * known patterns are ignored.
	 *
	 * @param page the downloaded page to process
	 */
	public void process(Page page) {
		Selectable url = page.getUrl();
		if (url.regex(jbUrl+"$").match()) {
			processIndex(page);
		} else if (url.regex(urlList).match()) {
			// Sub-pages are tested first; whatever is left is the disease overview page.
			if (url.regex(jieshao).match()) {
				processIntroduction(page);
			} else if (url.regex(shiliao).match()) {
				processDiet(page);
			} else if (isShortSection(url)) {
				processShortSection(page);
			} else {
				processOverview(page);
			}
		} else if (url.regex(fangan).match()) {
			processDrugPlan(page);
		}
	}

	/** Section index page: prints departments with their diseases and queues all disease links. */
	private static void processIndex(Page page) {
		Selectable doc = page.getHtml();
		boolean anyDepartment = false;
		// Department headers are read at positions 1, 3, 5, ... and their disease lists at
		// positions 2, 4, 6, ...; stop at the first position that yields no department.
		for (int di = 1, ci = 2; ; di += 2, ci += 2) {
			List<String> departments = doc.xpath("//div[@class='dis_search-div1']["+di+"]/b/text()").all();
			if (departments.isEmpty()) {
				break;
			}
			anyDepartment = true;
			System.out.println("departments" + departments.toString());
			List<String> content = doc.xpath("//div[@class='dis_search-div2']["+ci+"]/p/a/span/text()").all();
			System.out.println("content" + content.toString());
		}
		if (anyDepartment) {
			// This query does not depend on the loop position, so it is run once here rather
			// than on every iteration.
			List<String> diseaseLinks = doc.xpath("//div[@class='dis_search-div2']/p/a/@href").all();
			System.out.println(diseaseLinks.toString());
			page.addTargetRequests(diseaseLinks);
		}
		page.addTargetRequests(doc.links().regex(urlList).all());
	}

	/** Introduction ("jieshao") page: stores the header card fields and prints the entry body. */
	private static void processIntroduction(Page page) {
		Selectable doc = page.getHtml();
		System.out.println("jieshao:"+page.getUrl());
		// disease name
		page.putField("jieshao_first_view_name", doc.xpath("//div[@class='first-view']/div[@class='lemma-head clearfix']/h1/text()"));
		// image
		page.putField("jieshao_first_view_image", doc.xpath("//div[@class='first-view']/div[@class='card']/div[@class='clearfix']/div[@class='card-img']/img/@src"));
		// introduction
		page.putField("jieshao_first_view_content", doc.xpath("//div[@class='first-view']/div[@class='card']/div[@class='clearfix']/div[@class='summary']/html()"));
		// department(s) to visit
		page.putField("jieshao_first_view_departments", doc.xpath("//div[@class='first-view']/div[@class='card']/p[1]/text()").all());
		// entry body
		String html = doc.xpath("//div[@class='content']/html()").get();
		if (html == null) {
			// No content block on this page; without this guard the indexOf calls below
			// would throw and the fields stored above would be lost with the exception.
			return;
		}
		int articlesStart = html.indexOf("<h3>最新文章</h3>");
		int anchorStart = html.indexOf("<a name=\"3\" id=\"content3\"></a>");
		if (articlesStart != -1 && anchorStart != -1) {
			// Print the part before the "latest articles" heading and the part from the
			// content3 anchor onwards.
			System.out.println("jieshao_content");
			System.out.println(html.substring(0, articlesStart));
			System.out.println(html.substring(anchorStart,html.length()));
		}
	}

	/** Diet ("shiliao") page: stores headings and prints the recommended / to-avoid food tables. */
	private static void processDiet(Page page) {
		Selectable doc = page.getHtml();
		System.out.println("shiliao:"+page.getUrl());
		// disease
		page.putField("shiliao_jibing", doc.xpath("//div[@class='introduc_title']/b/text()"));
		// recommended foods: heading and must-read text
		page.putField("shiliao_eat", doc.xpath("//div[@class='introduc_cont1']/span/b/text()"));
		page.putField("shiliao_eat_read", doc.xpath("//div[@class='introduc_cont1']/p[@class='introduc_cont1-p']/text()"));
		printFoodTable(doc, "//div[@class='introduc_cont1']", "shiliao_eat_food");
		// foods to avoid: heading and must-read text
		page.putField("shiliao_can't_eat", doc.xpath("//div[@class='introduc_cont1 introduc_cont2']/span/b/text()"));
		page.putField("shiliao_can't_eat_read", doc.xpath("//div[@class='introduc_cont1 introduc_cont2']/p/text()"));
		printFoodTable(doc, "//div[@class='introduc_cont1 introduc_cont2']", "shiliao_can't_eat_food");
	}

	/** Prints food / reason / advice for each row of the food table inside {@code containerXpath}. */
	private static void printFoodTable(Selectable doc, String containerXpath, String label) {
		// Reading starts at row 2 (row 1 is presumably the table header) and stops at the
		// first row without a food cell.
		for (int row = 2; ; row++) {
			String rowXpath = containerXpath + "/table[@class='introduc_cont1-table']/tbody/tr["+row+"]";
			String food = doc.xpath(rowXpath + "/td[1]/text()").get();
			if (food == null) {
				break;
			}
			System.out.println(label + ": " + food);// food
			System.out.println(label + "_reason: " + doc.xpath(rowXpath + "/td[2]/text()"));// reason
			System.out.println(label + "_advance: " + doc.xpath(rowXpath + "/td[3]/text()"));// advice
		}
	}

	/** Returns whether {@code url} matches one of the six short text sub-page patterns. */
	private static boolean isShortSection(Selectable url) {
		String[] patterns = {bingyin, zhengzhuang, jianbie, bingfa, yufang, zhiliao};
		for (String pattern : patterns) {
			if (url.regex(pattern).match()) {
				return true;
			}
		}
		return false;
	}

	/** Short text sub-page: stores the section heading, its HTML body and the linked diseases. */
	private static void processShortSection(Page page) {
		Selectable doc = page.getHtml();
		System.out.println("short_title:"+page.getUrl());
		page.putField("short_department", doc.xpath("//section/h3/text()"));// disease type
		page.putField("short_content", doc.xpath("//section/div/html()"));// details
		page.putField("related_disease", doc.xpath("//section/div/a/text()").all());// related diseases
	}

	/** Disease overview page: queues sub-pages and the drug plan, stores summary fields, prints lists. */
	private static void processOverview(Page page) {
		Selectable doc = page.getHtml();
		System.out.println("gaishu:"+page.getUrl());
		addRequestIfPresent(page, doc.links().regex(jieshao).get());
		addRequestIfPresent(page, doc.links().regex(shiliao).get());
		page.addTargetRequests(doc.xpath("//ul[@class='clears s_left_sort']/li/a/@href").all());
		page.putField("department", doc.xpath("//section/h3/text()"));// disease type
		page.putField("abstract", doc.xpath("//section[@class='s_feritin img'][1]/div/dl/dd/text()"));// summary
		// medication plan: drug names and images, one card column at a time
		for (int i = 1; ; i++) {
			List<String> drugName = doc.xpath("//div[@class='card']/div[@class='clearfix']/div["+i+"]/img/@alt").all();
			if (drugName.isEmpty()) {
				break;
			}
			System.out.println("drug_name:"+drugName);
			List<String> drugImg = doc.xpath("//div[@class='card']/div[@class='clearfix']/div["+i+"]/img/@src").all();
			System.out.println("drug_img:"+drugImg);
		}
		// applicable to
		page.putField("drug_shiyong", doc.xpath("//div[@class='card']/p/text()"));
		// drug plan URL
		Selectable planUrl = doc.xpath("//div[@class='card']/p/a/@href");
		page.putField("fangan_url", planUrl);
		// The Selectable itself is never null; what may be missing is the extracted link.
		addRequestIfPresent(page, planUrl.get());
		// hospital must-read list
		for (int j = 1; ; j++) {
			String itemXpath = "//section[@class='s_Tohospital t20']/ul/li["+j+"]";
			String title = doc.xpath(itemXpath + "/b/text()").get();
			if (title == null) {
				break;
			}
			System.out.println("fangan_hosp_bikan: "+title);
			// Items 2, 4, 5 and 7 are read from the links inside the span; the others are
			// read from the span's own text.
			boolean linked = j == 2 || j == 4 || j == 5 || j == 7;
			String valueXpath = itemXpath + (linked ? "/span/a/text()" : "/span/text()");
			System.out.println("fangan_hosp_bikan: "+doc.xpath(valueXpath).all().toString());
		}
		// commonly used drugs
		printCommonDrugs(doc, "jibing_changyong", true);
		// related diseases
		page.putField("relation_disease", doc.xpath("//section[@class='s_disease']/div/a/text()").all());
	}

	/** Queues {@code link} as a follow-up request unless no link was extracted. */
	private static void addRequestIfPresent(Page page, String link) {
		if (link != null) {
			page.addTargetRequest(link);
		}
	}

	/** Prints name, image, indications (and optionally manufacturer) of each commonly used drug. */
	private static void printCommonDrugs(Selectable doc, String label, boolean includeCompany) {
		for (int i = 1; ; i++) {
			String entryXpath = "//section[@class='s_sodium']/a["+i+"]/dl";
			String name = doc.xpath(entryXpath + "/dd/h3/text()").get();
			if (name == null) {
				break;
			}
			System.out.println(label + "_name: "+name);// drug name
			System.out.println(label + "_image: "+doc.xpath(entryXpath + "/dt/img/@src"));// image
			System.out.println(label + "_indications: "+doc.xpath(entryXpath + "/dd[2]/text()"));// indications
			if (includeCompany) {
				System.out.println(label + "_company: "+doc.xpath(entryXpath + "/dd[3]/text()"));// manufacturer
			}
		}
	}

	/** Drug plan ("fangan") page: stores title and applicability, prints drugs and text sections. */
	private static void processDrugPlan(Page page) {
		Selectable doc = page.getHtml();
		System.out.println(page.getUrl());
		page.putField("fangan_title", doc.xpath("//section[@class='s_drugfloor clears']/h1/text()"));
		page.putField("fangan_shiyong", doc.xpath("//div[@class='jj']/text()"));
		// drug information: entries are read at positions 3, 5, 7, ...
		int i = 3;
		while (true) {
			String drugXpath = "//div[@class='s_m1']["+i+"]";
			List<String> names = doc.xpath(drugXpath + "/p[1]/a/text()").all();
			if (names.isEmpty()) {
				break;
			}
			System.out.println("fangan_durg_name: "+names);// drug name
			System.out.println("fangan_durg_image: "+doc.xpath(drugXpath + "/dt/a/img/@src").all().toString());// image
			System.out.println("fangan_durg_link: "+doc.xpath(drugXpath + "/p[1]/a/@href").all());// link
			System.out.println("fangan_durg_company: "+doc.xpath(drugXpath + "/p[2]/text()").all());// manufacturer
			System.out.println("fangan_durg_advantage: "+doc.xpath(drugXpath + "/p[3]/text()").all());// advantages
			i += 2;
		}
		// Text sections start at the position just before the first index that had no drug.
		for (int j = i - 1; ; j++) {
			String sectionXpath = "//div[@class='s_m']["+j+"]";
			String title = doc.xpath(sectionXpath + "/div[@class='h']/b/text()").get();
			if (title == null) {
				break;
			}
			System.out.println(title);
			System.out.println(doc.xpath(sectionXpath + "/div[@class='b']/html()").get());
		}
		// commonly used drugs
		printCommonDrugs(doc, "fangan_changyong", false);
	}

	/**
	 * Runs a single-threaded crawl starting from one disease overview page.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Spider.create(new JibingAction()).addUrl("http://m.120ask.com/jibing/touyun/").thread(1).run();
	}

}
